Celestin Apprentice 4

home *** CD-ROM | disk | FTP | other *** search

/ Celestin Apprentice 4 / Apprentice-Release4.iso / Source Code / Libraries / Scripting / Source / StevesRgExp.cpp < prev next >

Wrap

C/C++ Source or Header | 1995-09-09 | 13.2 KB | 415 lines | [TEXT/MPCC]

/*---------------------------------------------------------------
Copyright 1995, Steve Israelson

I own this code. You are free to use this code in any software you want.
You may not sell this source code at all, you can sell your product though.
If you want to include this code in any code collection (CD-Roms etc)
this is OK as long as I get a complimentary copy.

Steve.

Regular expression matching.

The RegExp::Parse() function will create a new regular expression object
based on your input string. You can then ask this object if it matches
a string and it will return true or false.
Groups of these objects can perform miracles.

These are the regular expression meta characters. I have no text defining
what the standard characters are, so I made these up from memory.
You can add more if you want, it's easy.

    ^       Beginning of line.
    $       End of line.
    []      Set.
    a-z     A range of characters in a set.
    ~       The following characters are not in the set.
    *       0 or more of the previous pattern.
    +       1 or more of the previous pattern.
    .       Any character.
    |       Or. Used between any two patterns. NOT IMPLEMENTED!!!!!
    &       Parameter. The previous pattern is a parameter.
    /       The next character is a literal.

All other characters are literals.

Note:
This parser is based on the ideas presented in the DrDobbs Sourcebook
magazine July/August 1995 issue. The article is by Todd D. Esposito and
Andrew K. Johnson.
Before reading this article and entering and debugging their code, I had
very little experience doing a scripting system. The concepts behind their
system are more powerful than the ones I came up with. The concept behind
this code is almost the same as the concepts they presented, but this
implementation is more complete, and thus more usable for end users.
---------------------------------------------------------------*/

#include "StevesRgExp.h"
#include <String.h>

/*---------------------------------------------------------------
Creates a linked list of regular expressions representing the text.
The root of the list will be returned. Pass in the text containing the expression. owner is used internally, so pass in nil. Also pass in the ID for this expression so you can figure out what expression matched the text. ---------------------------------------------------------------*/ RegExp *RegExp::Parse(char *text, RegExp *owner, long exprID) { // parse the text and determine what type of expression // is in it. Make that type of reg exp object and // continue until the text is exhausted RegExp *theExpression = nil; switch (*text++) { case '^': // beginning of line theExpression = new RBeginLine(text, exprID); break; case '$': // end of line theExpression = new REndLine(text, exprID); break; case '[': // set theExpression = new RSetExpr(text, exprID); break; case '*': // zero or more if (owner) owner->type = kReg_ZeroOrMore; return Parse(text, owner, exprID); break; case '+': // one or more if (owner) owner->type = kReg_OneOrMore; return Parse(text, owner, exprID); break; case '.': // any char theExpression = new RAnyChar(text, exprID); break; case '|': // or theExpression = new ROrExpr(text, exprID); break; case '&': // previous was a parameter if (owner) owner->parameter = true; return Parse(text, owner, exprID); break; case '/': // literalize next char, handled in RLiteral default: // literal theExpression = new RLiteral(text - 1, exprID); break; case 0: // end of text, do nothing break; } return theExpression; } /*--------------------------------------------------------------- Construct a regular expression based on some text. You MUST call next = Parse(text, this, newID); where text points to the characters that are left after you made your expression. ---------------------------------------------------------------*/ RegExp::RegExp(long newID) { ID = newID; next = nil; type = kReg_Once; // default type parameter = false; } /*--------------------------------------------------------------- Toast this object, but toast the next one first. 
---------------------------------------------------------------*/ RegExp::~RegExp() { if (next) delete next; } /*--------------------------------------------------------------- Match the regular expression with some text. Pass in the text to match, the position of starting character to begin matching on, or 0 for the first character. Pass in a pointer to a short if you want to know the position of the next un-matched character, or nil if you don't. Pass in a list to hold the parameters, or nil if you don't want any parameters back. If we match, then the next expression in our list is tried. If the next one fails, then we try to match again, until we fail or the match succeeds. MatchOne() internally uses the nextChar variable to keep track of how many characters its matched, and MUST set it to the next character that was un-matched. The very first time you are called it will be -1. ---------------------------------------------------------------*/ Boolean RegExp::Match(char *text, short start, short *last, LList *paramList) { short nextChar = -1; while (1) { // can we match our own criteria? if (!MatchOne(text, start, &nextChar)) return false; // save the position of the last match if (!next && last) *last = nextChar; // can our sub expressions match? if (!next || next->Match(text, nextChar, last, paramList)) { // if we have a parameter, then put it in the params here if (parameter && paramList) { char *param = new char[64]; // paramters default to this size, but you could dynamically do it strncpy(param, text + start, nextChar - start); param[nextChar - start] = 0; // terminate the string, does strncpy? paramList->InsertItemsAt(1, arrayIndex_First, ¶m); // add it to the front of the list } return true; } } return false; } /*--------------------------------------------------------------- Match the regular expression with some text. Over-ride and return true if you match. The value of end will be preserved between calls, and will always start with -1. 
Start is the index of the first character to be considered. ---------------------------------------------------------------*/ Boolean RegExp::MatchOne(char *text, short start, short *end) { *end = start; return false; } /*--------------------------------------------------------------- Construct a regular expression based on some text. ---------------------------------------------------------------*/ RBeginLine::RBeginLine(char *text, long newID) : RegExp(newID) { // nothing to do, make the next one in our list. next = Parse(text, this, newID); } /*--------------------------------------------------------------- Match the regular expression with some text. ---------------------------------------------------------------*/ Boolean RBeginLine::MatchOne(char *text, short start, short *end) { if (*end == -1 && !start) // we only match if we are at the start of the line, ie start = 0 { *end = start; return true; } return false; } /*--------------------------------------------------------------- Construct a regular expression based on some text. ---------------------------------------------------------------*/ REndLine::REndLine(char *text, long newID) : RegExp(newID) { // nothing to do, make the next one in our list. next = Parse(text, this, newID); } /*--------------------------------------------------------------- Match the regular expression with some text. ---------------------------------------------------------------*/ Boolean REndLine::MatchOne(char *text, short start, short *end) { // we only match if there are no more characters left in this line if (*end == -1 && (text[start] == 0 || text[start] == '\r')) { *end = start; return true; } return false; } /*--------------------------------------------------------------- Construct a regular expression based on some text. This makes an expression that can match a set. Simply uses an array of booleans to keep track of which characters are in the set. Could be better, but... The text should be "[...]" where ... 
can be any individual characters. you can also specify a range with the '-' character. Use '~' when you want to remove some chars from the set. [a-zA-Z~dD] matches all alphabetical chars except d and D ---------------------------------------------------------------*/ RSetExpr::RSetExpr(char *text, long newID) : RegExp(newID) { // remove the set from the text for (int x = 0; x < 256; ++x) charSet[x] = 0; char state = 1; char prevChar = 0; while (*text && *text != ']') { if (*text == '/') // quote the next character, ie the ']', or the '/' ++text; if (*text == '-' && prevChar && *(text + 1)) // set a whole range { ++text; for (int x = prevChar; x <= *text; ++x) charSet[x] = state; } else if (*text == '~') state = 0; else charSet[*text] = state; prevChar = *text; ++text; // next character } if (*text) // skip the ']' ++text; // make the next one in our list. next = Parse(text, this, newID); } /*--------------------------------------------------------------- Match the regular expression with some text. ---------------------------------------------------------------*/ Boolean RSetExpr::MatchOne(char *text, short start, short *end) { if (type == kReg_Once && *end != -1) // end is -1 the first time, so if it is not 0 then... return false; if (type == kReg_ZeroOrMore && *end == -1)// we first try matching 0 { *end = start; return true; } if (*end == -1) // the first time through, try only the first char *end = start; for (int x = start; x <= *end; ++x) if (!charSet[text[x]]) return false; *end = *end + 1; // we matched this char, so move end. return true; } /*--------------------------------------------------------------- Construct a regular expression based on some text. ---------------------------------------------------------------*/ RAnyChar::RAnyChar(char *text, long newID) : RegExp(newID) { // make the next one in our list. next = Parse(text, this, newID); } /*--------------------------------------------------------------- Match the regular expression with some text. 
---------------------------------------------------------------*/ Boolean RAnyChar::MatchOne(char *text, short start, short *end) { if (type == kReg_Once && *end != -1) // end is -1 the first time, so if it is not 0 then... return false; if (type == kReg_ZeroOrMore && *end == -1)// we first try matching 0 { *end = start; return true; } if (*end == -1) // the first time through, try only the first char *end = start; // since we match anything, we do not need to make any checks here // EXCEPT to see if we are at the end of the string if (!text[*end]) return false; // no more chars to match *end = *end + 1; // we matched this char, so move end. return true; } /*--------------------------------------------------------------- Construct a regular expression based on some text. ---------------------------------------------------------------*/ ROrExpr::ROrExpr(char *text, long newID) : RegExp(newID) { // NOT IMPLEMENTED yet // make the next one in our list. next = Parse(text, this, newID); } /*--------------------------------------------------------------- Match the regular expression with some text. ---------------------------------------------------------------*/ Boolean ROrExpr::MatchOne(char *text, short start, short *end) { return false; // We do not implement OR yet (how would we?) } /*--------------------------------------------------------------- Construct a regular expression based on some text. Collect characters until a meta character is encountered. This is our literal. 
---------------------------------------------------------------*/ RLiteral::RLiteral(char *text, long newID) : RegExp(newID) { Boolean done = false; short index = 0; while (!done) { switch (*text) { case 0: case '^': // beginning of line case '$': // end of line case '[': // set case '*': // zero or more case '+': // one or more case '.': // any char case '|': // or done = true; break; case '/': // literalize next char ++text; // skip the slash and drop into the literal code default: // literal buffer[index++] = *text++; break; } } buffer[index] = 0; // terminate string next = Parse(text, this, newID); } /*--------------------------------------------------------------- Match the regular expression with some text. Match the literal possible 0 or more times. ---------------------------------------------------------------*/ Boolean RLiteral::MatchOne(char *text, short start, short *end) { short x; if (type == kReg_Once && *end != -1) // end is -1 the first time, so if it is not 0 then... return false; if (type == kReg_ZeroOrMore && *end == -1) // we first try matching 0 { *end = start; return true; } if (*end == -1) // the first time through, try only the first set *end = start; for (x = start; x <= *end;) { short i = 0; while (buffer[i]) // match the entire buffer, return false if we hit the end of the string if (!text[x] || (buffer[i++] != text[x++])) return false; } *end = x; // the end has moved return true; }